# import all packages and set plots to be embedded inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import datetime as dt
import plotly.express as px
%matplotlib inline
# import warning and ignore future warnings
import warnings
warnings.simplefilter(action="ignore", category=FutureWarning)
# preliminary assessment function
def assess_data(file, encoding='utf-8'):
    """Load a CSV file and print a preliminary assessment of its contents.

    The report shows the first and last five rows, the shape, the
    ``DataFrame.info()`` summary, the number of unique and missing values
    per column, and the number of fully duplicated rows.

    Parameters
    ----------
    file : str, path-like or file-like
        Forwarded to ``pandas.read_csv``.
    encoding : str, default 'utf-8'
        Text encoding forwarded to ``pandas.read_csv``.

    Returns
    -------
    pandas.DataFrame
        The loaded data, unmodified.
    """
    # `display` is only injected by IPython/Jupyter; fall back to `print`
    # so the function also runs in a plain Python session.
    try:
        show = display
    except NameError:
        show = print

    # read file
    df = pd.read_csv(file, encoding=encoding)

    # check header
    print("The First Five Observations in DataFrame")
    show(df.head())
    print('\n')

    # check tail
    print("The Last Five Observations in DataFrame")
    show(df.tail())
    print('\n')

    # check shape of df
    print("The Shape of DataFrame")
    print(df.shape)
    print('\n')

    # check info of df
    print("Basic Information of DataFrame")
    print('\n')
    # info() prints its report itself and returns None, so it must not be
    # wrapped in display()/print() (that would emit a stray "None").
    df.info()
    print('\n')

    # check number of unique values in df
    print("Number of Unique Values in DataFrame")
    print('\n')
    print(df.nunique())
    print('\n')

    # check number of missing values in df
    print("Number of Missing Values in DataFrame")
    print('\n')
    print(df.isnull().sum())
    print('\n')

    # check number of duplicates in df
    print("Number of Duplicates in DataFrame")
    print("Number of duplicates: ", df.duplicated().sum())

    return df
# load the raw trip data and print its preliminary assessment
raw_data_path = '201902-fordgobike-tripdata.csv'
bike_df = assess_data(raw_data_path)
The First Five Observations in DataFrame
| duration_sec | start_time | end_time | start_station_id | start_station_name | start_station_latitude | start_station_longitude | end_station_id | end_station_name | end_station_latitude | end_station_longitude | bike_id | user_type | member_birth_year | member_gender | bike_share_for_all_trip | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 52185 | 2019-02-28 17:32:10.1450 | 2019-03-01 08:01:55.9750 | 21.0 | Montgomery St BART Station (Market St at 2nd St) | 37.789625 | -122.400811 | 13.0 | Commercial St at Montgomery St | 37.794231 | -122.402923 | 4902 | Customer | 1984.0 | Male | No |
| 1 | 42521 | 2019-02-28 18:53:21.7890 | 2019-03-01 06:42:03.0560 | 23.0 | The Embarcadero at Steuart St | 37.791464 | -122.391034 | 81.0 | Berry St at 4th St | 37.775880 | -122.393170 | 2535 | Customer | NaN | NaN | No |
| 2 | 61854 | 2019-02-28 12:13:13.2180 | 2019-03-01 05:24:08.1460 | 86.0 | Market St at Dolores St | 37.769305 | -122.426826 | 3.0 | Powell St BART Station (Market St at 4th St) | 37.786375 | -122.404904 | 5905 | Customer | 1972.0 | Male | No |
| 3 | 36490 | 2019-02-28 17:54:26.0100 | 2019-03-01 04:02:36.8420 | 375.0 | Grove St at Masonic Ave | 37.774836 | -122.446546 | 70.0 | Central Ave at Fell St | 37.773311 | -122.444293 | 6638 | Subscriber | 1989.0 | Other | No |
| 4 | 1585 | 2019-02-28 23:54:18.5490 | 2019-03-01 00:20:44.0740 | 7.0 | Frank H Ogawa Plaza | 37.804562 | -122.271738 | 222.0 | 10th Ave at E 15th St | 37.792714 | -122.248780 | 4898 | Subscriber | 1974.0 | Male | Yes |
The Last Five Observations in DataFrame
| duration_sec | start_time | end_time | start_station_id | start_station_name | start_station_latitude | start_station_longitude | end_station_id | end_station_name | end_station_latitude | end_station_longitude | bike_id | user_type | member_birth_year | member_gender | bike_share_for_all_trip | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 183407 | 480 | 2019-02-01 00:04:49.7240 | 2019-02-01 00:12:50.0340 | 27.0 | Beale St at Harrison St | 37.788059 | -122.391865 | 324.0 | Union Square (Powell St at Post St) | 37.788300 | -122.408531 | 4832 | Subscriber | 1996.0 | Male | No |
| 183408 | 313 | 2019-02-01 00:05:34.7440 | 2019-02-01 00:10:48.5020 | 21.0 | Montgomery St BART Station (Market St at 2nd St) | 37.789625 | -122.400811 | 66.0 | 3rd St at Townsend St | 37.778742 | -122.392741 | 4960 | Subscriber | 1984.0 | Male | No |
| 183409 | 141 | 2019-02-01 00:06:05.5490 | 2019-02-01 00:08:27.2200 | 278.0 | The Alameda at Bush St | 37.331932 | -121.904888 | 277.0 | Morrison Ave at Julian St | 37.333658 | -121.908586 | 3824 | Subscriber | 1990.0 | Male | Yes |
| 183410 | 139 | 2019-02-01 00:05:34.3600 | 2019-02-01 00:07:54.2870 | 220.0 | San Pablo Ave at MLK Jr Way | 37.811351 | -122.273422 | 216.0 | San Pablo Ave at 27th St | 37.817827 | -122.275698 | 5095 | Subscriber | 1988.0 | Male | No |
| 183411 | 271 | 2019-02-01 00:00:20.6360 | 2019-02-01 00:04:52.0580 | 24.0 | Spear St at Folsom St | 37.789677 | -122.390428 | 37.0 | 2nd St at Folsom St | 37.785000 | -122.395936 | 1057 | Subscriber | 1989.0 | Male | No |
The Shape of DataFrame (183412, 16) Basic Information of DataFrame <class 'pandas.core.frame.DataFrame'> RangeIndex: 183412 entries, 0 to 183411 Data columns (total 16 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 duration_sec 183412 non-null int64 1 start_time 183412 non-null object 2 end_time 183412 non-null object 3 start_station_id 183215 non-null float64 4 start_station_name 183215 non-null object 5 start_station_latitude 183412 non-null float64 6 start_station_longitude 183412 non-null float64 7 end_station_id 183215 non-null float64 8 end_station_name 183215 non-null object 9 end_station_latitude 183412 non-null float64 10 end_station_longitude 183412 non-null float64 11 bike_id 183412 non-null int64 12 user_type 183412 non-null object 13 member_birth_year 175147 non-null float64 14 member_gender 175147 non-null object 15 bike_share_for_all_trip 183412 non-null object dtypes: float64(7), int64(2), object(7) memory usage: 22.4+ MB
None
Number of Unique Values in DataFrame duration_sec 4752 start_time 183401 end_time 183397 start_station_id 329 start_station_name 329 start_station_latitude 334 start_station_longitude 335 end_station_id 329 end_station_name 329 end_station_latitude 335 end_station_longitude 335 bike_id 4646 user_type 2 member_birth_year 75 member_gender 3 bike_share_for_all_trip 2 dtype: int64 Number of Missing Values in DataFrame duration_sec 0 start_time 0 end_time 0 start_station_id 197 start_station_name 197 start_station_latitude 0 start_station_longitude 0 end_station_id 197 end_station_name 197 end_station_latitude 0 end_station_longitude 0 bike_id 0 user_type 0 member_birth_year 8265 member_gender 8265 bike_share_for_all_trip 0 dtype: int64 Number of Duplicates in DataFrame Number of duplicates: 0
# work on an independent copy so the raw dataframe stays untouched
bike_df_clean = bike_df.copy(deep=True)
# show the copy as the cell output
bike_df_clean
| duration_sec | start_time | end_time | start_station_id | start_station_name | start_station_latitude | start_station_longitude | end_station_id | end_station_name | end_station_latitude | end_station_longitude | bike_id | user_type | member_birth_year | member_gender | bike_share_for_all_trip | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 52185 | 2019-02-28 17:32:10.1450 | 2019-03-01 08:01:55.9750 | 21.0 | Montgomery St BART Station (Market St at 2nd St) | 37.789625 | -122.400811 | 13.0 | Commercial St at Montgomery St | 37.794231 | -122.402923 | 4902 | Customer | 1984.0 | Male | No |
| 1 | 42521 | 2019-02-28 18:53:21.7890 | 2019-03-01 06:42:03.0560 | 23.0 | The Embarcadero at Steuart St | 37.791464 | -122.391034 | 81.0 | Berry St at 4th St | 37.775880 | -122.393170 | 2535 | Customer | NaN | NaN | No |
| 2 | 61854 | 2019-02-28 12:13:13.2180 | 2019-03-01 05:24:08.1460 | 86.0 | Market St at Dolores St | 37.769305 | -122.426826 | 3.0 | Powell St BART Station (Market St at 4th St) | 37.786375 | -122.404904 | 5905 | Customer | 1972.0 | Male | No |
| 3 | 36490 | 2019-02-28 17:54:26.0100 | 2019-03-01 04:02:36.8420 | 375.0 | Grove St at Masonic Ave | 37.774836 | -122.446546 | 70.0 | Central Ave at Fell St | 37.773311 | -122.444293 | 6638 | Subscriber | 1989.0 | Other | No |
| 4 | 1585 | 2019-02-28 23:54:18.5490 | 2019-03-01 00:20:44.0740 | 7.0 | Frank H Ogawa Plaza | 37.804562 | -122.271738 | 222.0 | 10th Ave at E 15th St | 37.792714 | -122.248780 | 4898 | Subscriber | 1974.0 | Male | Yes |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 183407 | 480 | 2019-02-01 00:04:49.7240 | 2019-02-01 00:12:50.0340 | 27.0 | Beale St at Harrison St | 37.788059 | -122.391865 | 324.0 | Union Square (Powell St at Post St) | 37.788300 | -122.408531 | 4832 | Subscriber | 1996.0 | Male | No |
| 183408 | 313 | 2019-02-01 00:05:34.7440 | 2019-02-01 00:10:48.5020 | 21.0 | Montgomery St BART Station (Market St at 2nd St) | 37.789625 | -122.400811 | 66.0 | 3rd St at Townsend St | 37.778742 | -122.392741 | 4960 | Subscriber | 1984.0 | Male | No |
| 183409 | 141 | 2019-02-01 00:06:05.5490 | 2019-02-01 00:08:27.2200 | 278.0 | The Alameda at Bush St | 37.331932 | -121.904888 | 277.0 | Morrison Ave at Julian St | 37.333658 | -121.908586 | 3824 | Subscriber | 1990.0 | Male | Yes |
| 183410 | 139 | 2019-02-01 00:05:34.3600 | 2019-02-01 00:07:54.2870 | 220.0 | San Pablo Ave at MLK Jr Way | 37.811351 | -122.273422 | 216.0 | San Pablo Ave at 27th St | 37.817827 | -122.275698 | 5095 | Subscriber | 1988.0 | Male | No |
| 183411 | 271 | 2019-02-01 00:00:20.6360 | 2019-02-01 00:04:52.0580 | 24.0 | Spear St at Folsom St | 37.789677 | -122.390428 | 37.0 | 2nd St at Folsom St | 37.785000 | -122.395936 | 1057 | Subscriber | 1989.0 | Male | No |
183412 rows × 16 columns
The missing values are in the following columns: start_station_id, start_station_name, end_station_id, end_station_name, member_birth_year, and member_gender. The missing values will be left untreated since they will not affect the exploratory data analysis that will be conducted.
# parse the trip timestamps into datetime values
for time_column in ('start_time', 'end_time'):
    bike_df_clean[time_column] = pd.to_datetime(bike_df_clean[time_column])

# target dtype for every remaining column that needs a cast
dtype_plan = {
    # identifiers are labels, not quantities, so they are stored as strings
    'start_station_id': 'str',
    'end_station_id': 'str',
    'bike_id': 'str',
    # low-cardinality labels
    'user_type': 'category',
    'member_gender': 'category',
    'bike_share_for_all_trip': 'category',
    # birth year is treated as a label as well
    'member_birth_year': 'str',
}
for column_name, target_dtype in dtype_plan.items():
    bike_df_clean[column_name] = bike_df_clean[column_name].astype(target_dtype)

# NOTE(review): the station ids and birth year are floats with missing values
# before the cast, so the resulting strings look like '1988.0' and missing
# entries become the literal string 'nan' (they no longer count as null).
# confirm data types
bike_df_clean.dtypes
duration_sec int64 start_time datetime64[ns] end_time datetime64[ns] start_station_id object start_station_name object start_station_latitude float64 start_station_longitude float64 end_station_id object end_station_name object end_station_latitude float64 end_station_longitude float64 bike_id object user_type category member_birth_year object member_gender category bike_share_for_all_trip category dtype: object
# summary statistics of the numeric columns
numeric_summary = bike_df_clean.describe()
numeric_summary
| duration_sec | start_station_latitude | start_station_longitude | end_station_latitude | end_station_longitude | |
|---|---|---|---|---|---|
| count | 183412.000000 | 183412.000000 | 183412.000000 | 183412.000000 | 183412.000000 |
| mean | 726.078435 | 37.771223 | -122.352664 | 37.771427 | -122.352250 |
| std | 1794.389780 | 0.099581 | 0.117097 | 0.099490 | 0.116673 |
| min | 61.000000 | 37.317298 | -122.453704 | 37.317298 | -122.453704 |
| 25% | 325.000000 | 37.770083 | -122.412408 | 37.770407 | -122.411726 |
| 50% | 514.000000 | 37.780760 | -122.398285 | 37.781010 | -122.398279 |
| 75% | 796.000000 | 37.797280 | -122.286533 | 37.797320 | -122.288045 |
| max | 85444.000000 | 37.880222 | -121.874119 | 37.880222 | -121.874119 |
It is very important to create new features from the existing ones for further analysis of the dataset. Here, I created the duration_min column from duration_sec to analyse ride duration in minutes. I also created start_hour, start_day, start_week, and start_month from start_time, and end_hour, end_day, end_week, and end_month from end_time, to drill down the analysis on an hourly, daily, weekly and monthly basis.
# derive the trip duration in minutes, rounded to two decimals
seconds_per_minute = 60
bike_df_clean['duration_min'] = (bike_df_clean['duration_sec'] / seconds_per_minute).round(2)
# preview the first row to confirm the new column
bike_df_clean.head(n=1)
| duration_sec | start_time | end_time | start_station_id | start_station_name | start_station_latitude | start_station_longitude | end_station_id | end_station_name | end_station_latitude | end_station_longitude | bike_id | user_type | member_birth_year | member_gender | bike_share_for_all_trip | duration_min | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 52185 | 2019-02-28 17:32:10.145 | 2019-03-01 08:01:55.975 | 21.0 | Montgomery St BART Station (Market St at 2nd St) | 37.789625 | -122.400811 | 13.0 | Commercial St at Montgomery St | 37.794231 | -122.402923 | 4902 | Customer | 1984.0 | Male | No | 869.75 |
# create start_hour, start_day, start_week, and start_month from start_time
start_times = bike_df_clean['start_time']
bike_df_clean['start_hour'] = start_times.dt.hour
bike_df_clean['start_day'] = start_times.dt.day_name()
# Series.dt.week is deprecated and removed in pandas 2.0 (its warning is
# hidden by the FutureWarning filter set at the top of the notebook).
# isocalendar().week yields the same ISO week number; cast to int64 to keep
# the dtype the old accessor produced.
bike_df_clean['start_week'] = start_times.dt.isocalendar().week.astype('int64')
bike_df_clean['start_month'] = start_times.dt.month_name()
# check header
bike_df_clean.head(1)
| duration_sec | start_time | end_time | start_station_id | start_station_name | start_station_latitude | start_station_longitude | end_station_id | end_station_name | end_station_latitude | ... | bike_id | user_type | member_birth_year | member_gender | bike_share_for_all_trip | duration_min | start_hour | start_day | start_week | start_month | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 52185 | 2019-02-28 17:32:10.145 | 2019-03-01 08:01:55.975 | 21.0 | Montgomery St BART Station (Market St at 2nd St) | 37.789625 | -122.400811 | 13.0 | Commercial St at Montgomery St | 37.794231 | ... | 4902 | Customer | 1984.0 | Male | No | 869.75 | 17 | Thursday | 9 | February |
1 rows × 21 columns
# create end_hour, end_day, end_week, and end_month from end_time
end_times = bike_df_clean['end_time']
bike_df_clean['end_hour'] = end_times.dt.hour
bike_df_clean['end_day'] = end_times.dt.day_name()
# Series.dt.week is deprecated and removed in pandas 2.0 (its warning is
# hidden by the FutureWarning filter set at the top of the notebook).
# isocalendar().week yields the same ISO week number; cast to int64 to keep
# the dtype the old accessor produced.
bike_df_clean['end_week'] = end_times.dt.isocalendar().week.astype('int64')
bike_df_clean['end_month'] = end_times.dt.month_name()
# check header
bike_df_clean.head(1)
| duration_sec | start_time | end_time | start_station_id | start_station_name | start_station_latitude | start_station_longitude | end_station_id | end_station_name | end_station_latitude | ... | bike_share_for_all_trip | duration_min | start_hour | start_day | start_week | start_month | end_hour | end_day | end_week | end_month | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 52185 | 2019-02-28 17:32:10.145 | 2019-03-01 08:01:55.975 | 21.0 | Montgomery St BART Station (Market St at 2nd St) | 37.789625 | -122.400811 | 13.0 | Commercial St at Montgomery St | 37.794231 | ... | No | 869.75 | 17 | Thursday | 9 | February | 8 | Friday | 9 | March |
1 rows × 25 columns
# persist the cleaned dataframe to disk
# NOTE(review): the target path has no ".csv" extension and the row index is
# written as well (no index=False) - confirm this is what downstream readers expect.
clean_data_path = 'fordgobike_clean_data'
bike_df_clean.to_csv(clean_data_path, encoding='utf-8')
The dataset is made up of 183412 observations with 16 columns. Nine additional columns were created, bringing the total number of columns in the dataset to 25.
The main feature of interest is the duration_min and how other features contribute to the length of ride.
The features that will help to support the main feature of interest are; start_day, end_day, start_hour, end_hour, user_type, member_gender, member_birth_year and bike_share_for_all_trip.
In this section, investigate distributions of individual variables. If you see unusual points or outliers, take a deeper look to clean things up and prepare yourself to look at relationships between variables.
Rubric Tip: The project (Parts I alone) should have at least 15 visualizations distributed over univariate, bivariate, and multivariate plots to explore many relationships in the data set. Use reasoning to justify the flow of the exploration.
Rubric Tip: Use the "Question-Visualization-Observations" framework throughout the exploration. This framework involves asking a question from the data, creating a visualization to find answers, and then recording observations after each visualisation.
# preview the first five rows before plotting
bike_df_clean.head(n=5)
| duration_sec | start_time | end_time | start_station_id | start_station_name | start_station_latitude | start_station_longitude | end_station_id | end_station_name | end_station_latitude | ... | bike_share_for_all_trip | duration_min | start_hour | start_day | start_week | start_month | end_hour | end_day | end_week | end_month | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 52185 | 2019-02-28 17:32:10.145 | 2019-03-01 08:01:55.975 | 21.0 | Montgomery St BART Station (Market St at 2nd St) | 37.789625 | -122.400811 | 13.0 | Commercial St at Montgomery St | 37.794231 | ... | No | 869.75 | 17 | Thursday | 9 | February | 8 | Friday | 9 | March |
| 1 | 42521 | 2019-02-28 18:53:21.789 | 2019-03-01 06:42:03.056 | 23.0 | The Embarcadero at Steuart St | 37.791464 | -122.391034 | 81.0 | Berry St at 4th St | 37.775880 | ... | No | 708.68 | 18 | Thursday | 9 | February | 6 | Friday | 9 | March |
| 2 | 61854 | 2019-02-28 12:13:13.218 | 2019-03-01 05:24:08.146 | 86.0 | Market St at Dolores St | 37.769305 | -122.426826 | 3.0 | Powell St BART Station (Market St at 4th St) | 37.786375 | ... | No | 1030.90 | 12 | Thursday | 9 | February | 5 | Friday | 9 | March |
| 3 | 36490 | 2019-02-28 17:54:26.010 | 2019-03-01 04:02:36.842 | 375.0 | Grove St at Masonic Ave | 37.774836 | -122.446546 | 70.0 | Central Ave at Fell St | 37.773311 | ... | No | 608.17 | 17 | Thursday | 9 | February | 4 | Friday | 9 | March |
| 4 | 1585 | 2019-02-28 23:54:18.549 | 2019-03-01 00:20:44.074 | 7.0 | Frank H Ogawa Plaza | 37.804562 | -122.271738 | 222.0 | 10th Ave at E 15th St | 37.792714 | ... | Yes | 26.42 | 23 | Thursday | 9 | February | 0 | Friday | 9 | March |
5 rows × 25 columns
# print the column names, non-null counts and dtypes of the cleaned dataframe
# (this now includes the derived duration/hour/day/week/month columns)
bike_df_clean.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 183412 entries, 0 to 183411 Data columns (total 25 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 duration_sec 183412 non-null int64 1 start_time 183412 non-null datetime64[ns] 2 end_time 183412 non-null datetime64[ns] 3 start_station_id 183412 non-null object 4 start_station_name 183215 non-null object 5 start_station_latitude 183412 non-null float64 6 start_station_longitude 183412 non-null float64 7 end_station_id 183412 non-null object 8 end_station_name 183215 non-null object 9 end_station_latitude 183412 non-null float64 10 end_station_longitude 183412 non-null float64 11 bike_id 183412 non-null object 12 user_type 183412 non-null category 13 member_birth_year 183412 non-null object 14 member_gender 175147 non-null category 15 bike_share_for_all_trip 183412 non-null category 16 duration_min 183412 non-null float64 17 start_hour 183412 non-null int64 18 start_day 183412 non-null object 19 start_week 183412 non-null int64 20 start_month 183412 non-null object 21 end_hour 183412 non-null int64 22 end_day 183412 non-null object 23 end_week 183412 non-null int64 24 end_month 183412 non-null object dtypes: category(3), datetime64[ns](2), float64(5), int64(5), object(10) memory usage: 31.3+ MB
# list the distinct months in which rides ended
bike_df_clean['end_month'].unique()
array(['March', 'February'], dtype=object)
# use a single colour for every bar
bar_color = sns.color_palette()[0]
# gender labels sorted from most to least frequent
gender_order = bike_df_clean['member_gender'].value_counts().index
# bar chart of member counts per gender, in decreasing order of frequency
sns.countplot(x='member_gender', data=bike_df_clean, order=gender_order, color=bar_color)
plt.title('Proportion of Gender distribution of Members')
plt.ylabel('Frequency')
plt.xlabel('Gender of Members');
Most members in the dataset identify themselves as either male or female, while very few members fall under the "Other" category. It can be observed that there are more male bike riders than female riders.
# use a single colour for every bar
bar_color = sns.color_palette()[0]
# bar chart of ride counts per user type
sns.countplot(x='user_type', data=bike_df_clean, color=bar_color)
plt.title('Proportion of Bike User Types')
plt.ylabel('Frequency')
plt.xlabel('Bike User Type');
There are two bike user types in the dataset: Customer and Subscriber. Most of the bike users are subscribers.
# use a single colour for every bar
bar_color = sns.color_palette()[0]
# bar chart of ride counts per bike_share_for_all_trip value
sns.countplot(x='bike_share_for_all_trip', data=bike_df_clean, color=bar_color)
plt.title('Proportion of Bike Share for All Trip')
plt.ylabel('Frequency');
Only a few trips have bike_share_for_all_trip set to "Yes", i.e. only a small share of rides were made by users enrolled in the Bike Share for All programme.
# use a single colour for every bar
bar_color = sns.color_palette()[0]
# the 20 most frequent birth-year labels, most frequent first
top_birth_years = bike_df_clean['member_birth_year'].value_counts().head(20).index
# wide figure so that 20 tick labels fit
plt.figure(figsize=(12, 5))
# bar chart in decreasing order of frequency
sns.countplot(x='member_birth_year', data=bike_df_clean, order=top_birth_years, color=bar_color)
plt.title('Top 20 Birth Year with The Highest Frequency', fontsize=14)
plt.xlabel('Birth Year of Members', fontsize=14)
plt.ylabel('Frequency')
plt.xticks(rotation=15);
As can be observed from the figure above, the most common birth year among bike users is 1988. Also, there is a large number of users whose birth year was not recorded. If those values were available, they could change the conclusion drawn from this chart.
# list every distinct member_birth_year label in ascending (string) order
distinct_birth_years = bike_df_clean['member_birth_year'].unique()
np.sort(distinct_birth_years)
array(['1878.0', '1900.0', '1901.0', '1902.0', '1910.0', '1920.0',
'1927.0', '1928.0', '1930.0', '1931.0', '1933.0', '1934.0',
'1938.0', '1939.0', '1941.0', '1942.0', '1943.0', '1944.0',
'1945.0', '1946.0', '1947.0', '1948.0', '1949.0', '1950.0',
'1951.0', '1952.0', '1953.0', '1954.0', '1955.0', '1956.0',
'1957.0', '1958.0', '1959.0', '1960.0', '1961.0', '1962.0',
'1963.0', '1964.0', '1965.0', '1966.0', '1967.0', '1968.0',
'1969.0', '1970.0', '1971.0', '1972.0', '1973.0', '1974.0',
'1975.0', '1976.0', '1977.0', '1978.0', '1979.0', '1980.0',
'1981.0', '1982.0', '1983.0', '1984.0', '1985.0', '1986.0',
'1987.0', '1988.0', '1989.0', '1990.0', '1991.0', '1992.0',
'1993.0', '1994.0', '1995.0', '1996.0', '1997.0', '1998.0',
'1999.0', '2000.0', '2001.0', 'nan'], dtype=object)
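The birth year of bike riders ranges from 1878 to 2001, although the earliest years (e.g. 1878 and 1900) are implausible and are most likely data-entry errors. As expected, the young population makes up the largest part of the bike riders.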
# build bin edges 100 seconds wide, from 0 up to just past the longest trip
bin_width = 100
longest_trip = bike_df_clean['duration_sec'].max()
bin_edges = np.arange(0, longest_trip + bin_width, bin_width)
# histogram of trip duration on a linear scale
plt.figure(figsize=(10, 5))
plt.hist(x='duration_sec', data=bike_df_clean, bins=bin_edges)
# zoom in on the bulk of the data; the long tail lies beyond 5000 seconds
plt.xlim(0, 5000)
plt.xlabel('Duration in seconds');
The distribution seems to be right-skewed, with very few values at the tail end. The majority of the values are concentrated between 100 and 1000 seconds.
# the distribution is right skewed with a long tail; a log scale should help to elucidate the distribution further
log_binsize = 0.025
durations = bike_df_clean['duration_sec']
# start the bins at the shortest trip instead of a hard-coded 10**2.4 (~251 s):
# the minimum duration in the data is 61 s, so the fixed lower edge silently
# dropped every trip shorter than about four minutes from the histogram
log_min = np.log10(durations.min())
log_max = np.log10(durations.max())
bins = 10 ** np.arange(log_min, log_max + log_binsize, log_binsize)
plt.figure(figsize=(10, 6))
plt.hist(data=bike_df_clean, x='duration_sec', bins=bins)
plt.xscale('log')
plt.xlabel('Duration in seconds')
plt.ylabel('Frequency')
plt.title('Distribution of Bike Duration in Seconds');
Using a log scale, the distribution of the trip duration in seconds was further verified to be right-skewed, with the majority of the trips at the left part of the chart.
# use a single colour for every bar
bar_color = sns.color_palette()[0]
# the ten most frequently used start stations, most frequent first
top_start_stations = bike_df_clean['start_station_name'].value_counts().head(10).index
# horizontal bars keep the long station names readable
plt.figure(figsize=(10, 4))
sns.countplot(y='start_station_name', data=bike_df_clean, order=top_start_stations, color=bar_color)
plt.title('Top Ten Start Station with The Highest Frequency', fontsize=16)
plt.xlabel('Frequency of Use', fontsize=14)
plt.ylabel('Start Station Name', fontsize=14);
The start station with the highest frequency of use is Market St at 10th St, followed by San Francisco Caltrain Station, and so on, as presented in the chart above.
# use a single colour for every bar
bar_color = sns.color_palette()[0]
# the ten most frequently used end stations, most frequent first
top_end_stations = bike_df_clean['end_station_name'].value_counts().head(10).index
# horizontal bars keep the long station names readable
plt.figure(figsize=(10, 4))
sns.countplot(y='end_station_name', data=bike_df_clean, order=top_end_stations, color=bar_color)
plt.title('Top Ten End Station with The Highest Frequency', fontsize=16)
plt.xlabel('Frequency of Use', fontsize=14)
plt.ylabel('End Station Name', fontsize=14);
The end station with the highest frequency of use is San Francisco Caltrain Station, followed by Market St at 10th St, and so on, as presented in the chart above.
# one wide figure holding both panels side by side
plt.figure(figsize=[20, 5])
# use a single colour for every bar
bar_color = sns.color_palette()[0]
# show the weekdays in calendar order rather than by frequency
weekday_order = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
# (column, x-axis label, title) for the left and right panel respectively
day_panels = [
    ('start_day', 'Start Day of Ride', 'Distribution of Start Day of Ride'),
    ('end_day', 'End Day of Ride', 'Distribution of End Day of Ride'),
]
for position, (day_column, x_label, panel_title) in enumerate(day_panels, start=1):
    plt.subplot(1, 2, position)
    sns.countplot(data=bike_df_clean, x=day_column, color=bar_color, order=weekday_order)
    plt.xlabel(x_label, fontsize=14)
    plt.title(panel_title, fontsize=16)
The two charts have similar behaviour. Most rides start and end on Thursdays, with the fewest rides on Saturdays and Sundays.
# one wide figure holding both panels side by side
plt.figure(figsize=[20, 5])
# use a single colour for every bar
base_color = sns.color_palette()[0]
# hours of the day run from 0 to 23; the previous range(1, 24) left out
# hour 0, so rides starting or ending between midnight and 1 am were not drawn
order = list(range(24))
# (column, x-axis label, title) for the left and right panel respectively
hour_panels = [
    ('start_hour', 'Start Hour of Ride', 'Distribution of Start Hour of Ride'),
    ('end_hour', 'End Hour of Ride', 'Distribution of End Hour of Ride'),
]
for position, (hour_column, x_label, panel_title) in enumerate(hour_panels, start=1):
    plt.subplot(1, 2, position)
    sns.countplot(data=bike_df_clean, x=hour_column, color=base_color, order=order)
    plt.xlabel(x_label, fontsize=14)
    plt.title(panel_title, fontsize=16)
The two charts have similar behaviour. There are two peak start and end hours of the day: the 8th and 17th hours.
The distribution of duration_sec seems to be right-skewed, with very few values at the tail end. I used a log transformation to further verify the distribution of the values.
Most of the features investigated have unique shapes, and there was no need for further cleaning or tidying.
# preview the first five rows again before the next section
bike_df_clean.head(n=5)
| duration_sec | start_time | end_time | start_station_id | start_station_name | start_station_latitude | start_station_longitude | end_station_id | end_station_name | end_station_latitude | ... | bike_share_for_all_trip | duration_min | start_hour | start_day | start_week | start_month | end_hour | end_day | end_week | end_month | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 52185 | 2019-02-28 17:32:10.145 | 2019-03-01 08:01:55.975 | 21.0 | Montgomery St BART Station (Market St at 2nd St) | 37.789625 | -122.400811 | 13.0 | Commercial St at Montgomery St | 37.794231 | ... | No | 869.75 | 17 | Thursday | 9 | February | 8 | Friday | 9 | March |
| 1 | 42521 | 2019-02-28 18:53:21.789 | 2019-03-01 06:42:03.056 | 23.0 | The Embarcadero at Steuart St | 37.791464 | -122.391034 | 81.0 | Berry St at 4th St | 37.775880 | ... | No | 708.68 | 18 | Thursday | 9 | February | 6 | Friday | 9 | March |
| 2 | 61854 | 2019-02-28 12:13:13.218 | 2019-03-01 05:24:08.146 | 86.0 | Market St at Dolores St | 37.769305 | -122.426826 | 3.0 | Powell St BART Station (Market St at 4th St) | 37.786375 | ... | No | 1030.90 | 12 | Thursday | 9 | February | 5 | Friday | 9 | March |
| 3 | 36490 | 2019-02-28 17:54:26.010 | 2019-03-01 04:02:36.842 | 375.0 | Grove St at Masonic Ave | 37.774836 | -122.446546 | 70.0 | Central Ave at Fell St | 37.773311 | ... | No | 608.17 | 17 | Thursday | 9 | February | 4 | Friday | 9 | March |
| 4 | 1585 | 2019-02-28 23:54:18.549 | 2019-03-01 00:20:44.074 | 7.0 | Frank H Ogawa Plaza | 37.804562 | -122.271738 | 222.0 | 10th Ave at E 15th St | 37.792714 | ... | Yes | 26.42 | 23 | Thursday | 9 | February | 0 | Friday | 9 | March |
5 rows × 25 columns
# look up one ride that started at "Market St at 10th St" to read off the
# station's latitude and longitude
is_market_st_start = bike_df_clean['start_station_name'] == "Market St at 10th St"
bike_df_clean[is_market_st_start].head(1)
| duration_sec | start_time | end_time | start_station_id | start_station_name | start_station_latitude | start_station_longitude | end_station_id | end_station_name | end_station_latitude | ... | bike_share_for_all_trip | duration_min | start_hour | start_day | start_week | start_month | end_hour | end_day | end_week | end_month | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 38 | 1066 | 2019-02-28 23:31:16.139 | 2019-02-28 23:49:02.792 | 58.0 | Market St at 10th St | 37.776619 | -122.417385 | 145.0 | 29th St at Church St | 37.743684 | ... | No | 17.77 | 23 | Thursday | 9 | February | 23 | Thursday | 9 | February |
1 rows × 25 columns
# centre the map on the "Market St at 10th St" station
map_center = {"lat": 37.776619, "lon": -122.417385}
# NOTE(review): the whole trip table is passed in, so each station is drawn
# once per trip that started there - confirm this is intended.
# scatter map of the start-station coordinates, one colour per station id
fig = px.scatter_mapbox(
    data_frame=bike_df_clean,
    lat="start_station_latitude",
    lon="start_station_longitude",
    color="start_station_id",
    hover_data=["start_station_name"],
    center=map_center,
    width=1000,
    height=600,
)
# use the OpenStreetMap tiles as the base map
fig.update_layout(mapbox_style="open-street-map")
fig.show()